using Pkg
Pkg.activate(".")

using DataFrames, CSV, DataFramesMeta, StatsPlots, Downloads, ProgressMeter, Weave, 
    Loess, Arrow, Transducers, Memoization, ReverseDiff, Zygote, Turing, LinearAlgebra, ColorSchemes,
    Serialization

Getting Started With Julia for Data Analysis

In this project, we will download some files from the CA dept of Education which give averaged results for test scores in all schools in CA across various student groups within the schoool.

The data is available at: https://caaspp-elpac.cde.ca.gov/caaspp/ResearchFileListSB?ps=true&lstTestYear=2021&lstTestType=B&lstCounty=00&lstDistrict=00000#dl

We are going to get the Los Angeles county files which avoids unnecessary data.

Unfortunately some years have different "versions" and there is no way on the site to get a list of files, so we need to just by hand click through and find out the file names, for example:

https://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/sbca2015all19csv_v3.zip for 2014-2015 LA County

https://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/sbca2021all19csv_v2.zip for 2020-2021 LA County data...

there is no 2019-2020 data as COVID prevented the testing from happening.

versions = [2015 => 3, 2016 => 3, 2017 => 2, 2018 => 3, 2019 => 4 , 2021 => 2]


files = ["sb_ca$(vv[1])_all_19_csv_v$(vv[2]).zip" for vv in versions]
6-element Vector{String}:
 "sb_ca2015_all_19_csv_v3.zip"
 "sb_ca2016_all_19_csv_v3.zip"
 "sb_ca2017_all_19_csv_v2.zip"
 "sb_ca2018_all_19_csv_v3.zip"
 "sb_ca2019_all_19_csv_v4.zip"
 "sb_ca2021_all_19_csv_v2.zip"

We also need the entity file that describes the codes for each school/district etc and the codes for the tests and student group descriptions we'll assume for our analysis that none of the entities of interest changed during the time period and just get the recent ones.

push!(files,"sb_ca2021entities_csv.zip")
push!(files,"Tests.zip")
push!(files,"StudentGroups.zip")
9-element Vector{String}:
 "sb_ca2015_all_19_csv_v3.zip"
 "sb_ca2016_all_19_csv_v3.zip"
 "sb_ca2017_all_19_csv_v2.zip"
 "sb_ca2018_all_19_csv_v3.zip"
 "sb_ca2019_all_19_csv_v4.zip"
 "sb_ca2021_all_19_csv_v2.zip"
 "sb_ca2021entities_csv.zip"
 "Tests.zip"
 "StudentGroups.zip"

Downloading the files

We'll change directory into the data dir, download each of the files, unzip them, and then cd out of the directory.

We use try/finally to ensure that we wind up in the same directory even if there's an error, then read the directory to see what files we have...

try 
    cd("data")
    @showprogress 3 for f in files
        if ! ispath(f) # if we don't have the file already
            Downloads.download("https://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/$(f)",f)
        end
        run(`unzip -u $(f)`)
    end
finally
    cd("..")
end

readdir("data")
Archive:  sb_ca2015_all_19_csv_v3.zip
Archive:  sb_ca2016_all_19_csv_v3.zip
Archive:  sb_ca2017_all_19_csv_v2.zip
Archive:  sb_ca2018_all_19_csv_v3.zip
Archive:  sb_ca2019_all_19_csv_v4.zip
Archive:  sb_ca2021_all_19_csv_v2.zip
Archive:  sb_ca2021entities_csv.zip
Archive:  Tests.zip
Archive:  StudentGroups.zip
19-element Vector{String}:
 "StudentGroups.txt"
 "StudentGroups.zip"
 "Tests.txt"
 "Tests.zip"
 "sb_ca2015_all_19_csv_v3.txt"
 "sb_ca2015_all_19_csv_v3.zip"
 "sb_ca2015_all_ascii_v3.zip"
 "sb_ca2016_all_19_csv_v3.txt"
 "sb_ca2016_all_19_csv_v3.zip"
 "sb_ca2017_all_19_csv_v2.txt"
 "sb_ca2017_all_19_csv_v2.zip"
 "sb_ca2018_all_19_csv_v3.txt"
 "sb_ca2018_all_19_csv_v3.zip"
 "sb_ca2019_all_19_csv_v4.txt"
 "sb_ca2019_all_19_csv_v4.zip"
 "sb_ca2021_all_19_csv_v2.txt"
 "sb_ca2021_all_19_csv_v2.zip"
 "sb_ca2021entities_csv.txt"
 "sb_ca2021entities_csv.zip"

We have all the files

Next, we need to read the data in, and try to subset down to the schools of interest. Unfortunately, the files don't all have the same columns, nor do they have the same delimeters! What do we do about that? DataFrames allows us to append!(df,df2) and handle the case where they don't have exactly the same columns by various methods by specifying the "cols" option (see docs) cols=:union causes the resulting dataset to have all the columns from both files, but with missing values where needed

intify(s::Int64) = s
function intify(s)
    x = tryparse(Int64,s)
    if isnothing(x)
        missing
    else
        x
    end
end

floatormiss(x::Float64) = x
function floatormiss(x)
    if ismissing(x)
        return missing
    end
    if isa(x,String)
        let v = tryparse(Float64,x);
            if isnothing(v)
                missing
            else
                v
            end
        end
    end
end

testscores = DataFrame()
entities = DataFrame()
tests = DataFrame()
students = DataFrame()

const cachedir = "/var/cache/userdata/dlakelan/juliacache"

if ! ispath(joinpath(cachedir,"tests.arrow")) # there are no cache files, build them.
    for f in filter(endswith(".txt"),readdir("data"))
        @show f
        if occursin("entities",f)
            global entities = CSV.read(joinpath("data",f),DataFrame; normalizenames=true, delim="^")
        elseif occursin("Student",f)
            global students = CSV.read(joinpath("data",f),DataFrame; normalizenames=true,delim="^")
        elseif occursin("Tests",f)
            global tests = CSV.read(joinpath("data",f),DataFrame; normalizenames=true,delim="^")
        else
            df = CSV.read(joinpath("data",f),DataFrame; normalizenames=true) # uses a standard "," delimiter
            append!(testscores,df; cols=:union) # we will collect all the various columns with missing data where they don't exist
        end
    end
    Arrow.write(joinpath(cachedir,"tests.arrow"),tests)
    Arrow.write(joinpath(cachedir,"students.arrow"),students)
    Arrow.write(joinpath(cachedir,"entities.arrow"),entities)
    Arrow.write(joinpath(cachedir,"testscores.arrow"),testscores)
end

# now we know we have cache files. read them (note doing it this way means we're always analyzing the data from the cache)

tests = DataFrame(Arrow.Table(joinpath(cachedir,"tests.arrow")))
students = DataFrame(Arrow.Table(joinpath(cachedir,"students.arrow")))
entities = DataFrame(Arrow.Table(joinpath(cachedir,"entities.arrow")))

# the dataframe for test scores is huge, 3.8 million rows and a lot of columns, we only care about these districts.
# we run the rows of the Arrow.Table through a Filter transducer that filters on District_Code and collect the ones we care about, saving gigabytes
# of RAM
ourentities = @subset(entities,in.(:District_Name, Ref(["Pasadena Unified","Glendale Unified", "Alhambra Unified"])))
#@show ourentities
let ourdistcodes = unique(ourentities.District_Code);
    global testscores = Tables.rows(Arrow.Table(joinpath(cachedir,"testscores.arrow"))) |> Filter(x -> !ismissing(x.District_Code) && x.District_Code in ourdistcodes && x.School_Code .!= 0) |> DataFrame
    @rtransform!(testscores, :Students_Tested = 
        if ismissing(:Students_Tested) 
            intify(:Total_Tested_at_Subgroup_Level)
        else
            intify(:Students_Tested)
        end,
        :CAASPP_Reported_Enrollment = intify(:CAASPP_Reported_Enrollment),
        :Total_Tested_At_Entity_Level = intify(:Total_Tested_At_Entity_Level),
        :Mean_Scale_Score = floatormiss(:Mean_Scale_Score))
end

"Done Loading Data..."
"Done Loading Data..."
pusdentities = @subset(entities,.! ismissing.(:District_Name) .&& :District_Name .== "Pasadena Unified" .&& .! ismissing.(:School_Name))
alhambraentities = @subset(entities,.! ismissing.(:District_Name) .&& :District_Name .== "Alhambra Unified" .&& .! ismissing.(:School_Name))
glendaleentities = @subset(entities,.! ismissing.(:District_Name) .&& :District_Name .== "Glendale Unified" .&& .! ismissing.(:School_Name))

pusdtests = leftjoin(@select(pusdentities,:School_Code,:School_Name,:District_Name),testscores,on = :School_Code, matchmissing=:notequal)
alhambratests = leftjoin(@select(alhambraentities,:School_Code,:School_Name,:District_Name),testscores,on = :School_Code, matchmissing=:notequal)
glendaletests = leftjoin(@select(glendaleentities,:School_Code,:School_Name,:District_Name),testscores,on = :School_Code, matchmissing=:notequal)

"Done subsetting..." # so we don't output the above table
"Done subsetting..."

Now, let's just plot points...

We want points for each test... TestID = 1 means english, 2 means Math (for "smarter balanced" assessment). Let's group the tests by cohort, meaning TestYear - (Grade - 3) basically the year they were in 3rd grade which is the first year you take the test.

function plotschools(entities,testscores)

    for (testid,testname) in Iterators.zip([1,2],["English","Math"])
        let pl = []
            for sch in eachrow(entities)
                ourdf = @subset(testscores,:School_Code .== sch.School_Code .&& :Subgroup_ID .== 1 .&& :Grade .< 13 .&& :Test_Id .== testid .&& .!ismissing.(:Mean_Scale_Score))
    #            @show ourdf
                if nrow(ourdf) > 0
                    p = @df ourdf plot(:Grade,:Mean_Scale_Score; xlim=(3,12),ylim = (2250,2750),group=:Test_Year .- (:Grade .- 3),legend=false,size=(250,250), title="$(testname): $(sch.School_Name)",linewidth=3)
                    p = @df ourdf scatter!(:Grade,:Mean_Scale_Score; xlim=(3,12),ylim = (2250,2750),group=:Test_Year .- (:Grade .- 3),legend=false,size=(250,250), title="$(testname): $(sch.School_Name)",markerstrokewidth=0)
                    push!(pl,p)

                end
            end
            display(plot(pl...; size=(1500,1500)))
        end
    end
end

plotschools(pusdentities,@subset(pusdtests,:Subgroup_ID .== 1))
plotschools(alhambraentities,@subset(alhambratests,:Subgroup_ID .== 1))

Clearly there are some differences between schools. Since we are looking at the average across the school some of this will be because the demographic and socioeconomic mix of the students is different. For example the middle schoolers at the relative wealthy community of Sierra Madre Middle School are testing about the same as the high school 11th graders at Blair and Marshall Fundamental.

Also Alhambra schools appear to have higher achievement in general.

A considerable difference in overall average test scores can be attributed to a different mix of students. So let's break down the schools by demographic groups. The DemographicID in the students table describes the StudentGroups. We can break this down by:

"Economically Disadvantaged" vs "Not economically disadvantaged"

Race and ethnicity

Parent Education Level

For now, let's focus on math scores, and we'll iterate over every school, and output a graph that shows the average curve for each parent education level:

function plotedlevel(schools,scores,dist)

    edlevnames = Dict(90 => "No HSD", 91 => "HSD", 92 => "Some College", 93 => "College Grad", 94=>"Grad School")

    for sch in eachrow(schools)
        ourdf = @subset(scores,:Test_Id .== 2 .&& :School_Code .== sch.School_Code .&& in.(:Subgroup_ID ,Ref(90:94)))
        p = @df ourdf scatter(:Grade,:Mean_Scale_Score; xlim=(2.5,12), ylim=(2300,2800),title="$(dist)\nMath $(sch.School_Name)\nBy Parent Ed",label=false,markersize=3,size=(500,500))
        for edlev in 90:94
            subs = @subset(ourdf,:Subgroup_ID .== edlev .&& .!ismissing.(:Grade) .&& .! ismissing.(:Mean_Scale_Score))
            if nrow(subs) < 4 
                continue
            end
            #println("There are $(nrow(subs)) observations for $(sch.School_Name)")
            grades = collect(minimum(subs.Grade):maximum(subs.Grade))
            if length(grades) > 2
                l = loess(Float64.(subs.Grade),Float64.(subs.Mean_Scale_Score))
                p = plot!(grades,map(x -> Loess.predict(l,x), Float64.(grades)); label=edlevnames[edlev], linewidth=3)
            end
        end


        display(p)
    end

end


plotedlevel(pusdentities,pusdtests,"PUSD")
plotedlevel(alhambraentities,alhambratests,"Alhambra")
plotedlevel(glendaleentities,glendaletests,"Glendale")
function plotdistrictedlev(tests,name)
    edlevnames = Dict(90 => "No HSD", 91 => "HSD", 92 => "Some College", 93 => "College Grad", 94=>"Grad School")
    p = @df tests scatter(:Grade,:Mean_Scale_Score; xlim=(2.5,12), ylim=(2300,2800), title="$(name)\nMath Scores")
    for edlev in keys(edlevnames)
        subs = @subset(tests, :Subgroup_ID .== edlev .&& .! ismissing.(:Grade) .&& .! ismissing.(:Mean_Scale_Score))
        grades = collect(3:11)
        l = loess(Float64.(subs.Grade), Float64.(subs.Mean_Scale_Score))
        p = plot!(grades,map(x -> Loess.predict(l,x), Float64.(grades)); label = edlevnames[edlev],linewidth=3)
    end
    display(p)
    p
end

p1 = plotdistrictedlev(@subset(pusdtests,:Test_Id .== 2),"PUSD")
p2 = plotdistrictedlev(@subset(alhambratests,:Test_Id .== 2),"Alhambra")
p3 = plotdistrictedlev(@subset(glendaletests,:Test_Id .== 2),"Glendale")

plot(p1,p2,p3; layout=(1,3),legend=:topleft,size=(2000,500))

Let's summarize the result via a model

One reason to create a mathematical model is that it can simplify your understanding of a problem. For example a drag coefficient is a kind of summary of the entire pressure field on the surface of an object... integrated together it results in a certain amount of drag.

In our case we will assume that variation from year to year is low, so that we can pool the years together, and get a summary of how quickly students are learning math, by fitting a line through different groups of grades: (3,4,5) (6,7,8) and the level at grade 11 we can summarize the general trend in each school as just 5 numbers (levels and slopes of each group), rather than the details of potentially hundreds of measurements.

We will assume the level and slope of these three lines depends on parents education level only, since we don't have individual student data with covariates where we could fit a more complicated model.

Using Turing.jl for a Bayesian model

@model function schoolmod(school,parented,grade,meanscore,nschools,nedlev)

    c4avg ~ Normal(2450.0,200.0)
    c7avg ~ Normal(2500.0,200.0)
    c11avg ~ Normal(2550.0,200.0)

    m4avg ~ Normal(0.0,100.0)
    m7avg ~ Normal(0.0,100.0)
    msd ~ Gamma(10.0,20.0/9.0)
    csd ~ Gamma(10.0,20.0/9.0)
    c4 ~ MvNormal(repeat([0.0],nschools),csd^2*I(nschools))
    c7 ~ MvNormal(repeat([0.0],nschools),csd^2*I(nschools))
    c11 ~ MvNormal(repeat([0.0],nschools),csd^2*I(nschools))
    m4 ~ MvNormal(repeat([0.0],nschools),msd^2*I(nschools))
    m7 ~ MvNormal(repeat([0.0],nschools),msd^2*I(nschools))

    edlevmult ~ filldist(Gamma(20.0,1.0/19),nschools)

    cedlev4 ~ MvNormal(repeat([0.0],nedlev),Diagonal([50.0,50.0,50.0,50.0,50.0].^2))
    cedlev7 ~ MvNormal(repeat([0.0],nedlev),Diagonal([50.0,50.0,50.0,50.0,50.0].^2))
    cedlev11 ~ MvNormal(repeat([0.0],nedlev),Diagonal([50.0,50.0,50.0,50.0,50.0].^2))

    medlev4 ~ MvNormal(repeat([0.0],nedlev),Diagonal([20.0,20.0,20.0,20.0,20.0].^2))
    medlev7 ~ MvNormal(repeat([0.0],nedlev),Diagonal([20.0,20.0,20.0,20.0,20.0].^2))

    s ~ Gamma(10.0,50/10.0)


    preds = zeros(eltype(s),length(meanscore))
    for (i,(s,ed,g,sc)) in Iterators.enumerate(Iterators.zip(school,parented,grade,meanscore))
        if ed  == 1 # base case
            med4 = med7 = ced4 = ced7 = ced11 = zero(eltype(cedlev4))
        else
            ced4 = cedlev4[ed] * edlevmult[s]
            ced7 = cedlev7[ed] * edlevmult[s]
            ced11 = cedlev11[ed] * edlevmult[s]
            med4 = medlev4[ed] * edlevmult[s]
            med7 = medlev7[ed] * edlevmult[s]
        end
        if g in (3,4,5)
            preds[i] = (c4avg + c4[s] + ced4) + (m4avg + m4[s] + med4) * (g-4)
        elseif g in (6,7,8)
            preds[i] = (c7avg + c7[s] + ced7) + (m7avg + m7[s] + med7) * (g-7)
        elseif g == 11
            preds[i] = (c11avg + c11[s] + ced11)
        end
    end
    meanscore ~ MvNormal(preds,s^2 * I(length(preds)))
end

edlevtests = @chain begin @subset(testscores,.!ismissing.(:Mean_Scale_Score) .&& :Test_Id .== 2 .&& :School_Code .!= 0 .&& in.(:Subgroup_ID,Ref((90,91,92,93,94))))
    @transform(:Edlev = :Subgroup_ID .- 89)
    @transform(:Score = Float64.(:Mean_Scale_Score))
end

schoolids = map(Pair,edlevtests.District_Code,edlevtests.School_Code)
uniqueids = unique(schoolids)
schoolnames = Dict()
schooldists = Dict()
schoolgrades = Dict()

for (i,k) in Iterators.enumerate(uniqueids)
    schoolnames[i] = "Unknown"
    schoolnames[k] = "Unknown"
    for g in 1:11
        schoolgrades[k => g] = false
    end
    schooldists[k] = "Unknown"
    schooldists[i] = "Unknown"
end

for r in eachrow(entities)
    schoolnames[r.District_Code => r.School_Code] = r.School_Name
    schooldists[r.District_Code => r.School_Code] = r.District_Name
    for grade in 1:11
        schoolgrades[(r.District_Code => r.School_Code) => grade ] = false
    end
end

for i in 1:length(uniqueids)
    schooldists[i] = schooldists[uniqueids[i]]
end

for r in eachrow(testscores)
    schoolgrades[(r.District_Code => r.School_Code) => r.Grade ] = true
end

schoolDict = Dict()
merge!(schoolDict,Dict(map(Pair,uniqueids,1:length(uniqueids))))
merge!(schoolDict,Dict(map(Pair,1:length(uniqueids),uniqueids)))

for i in 1:length(uniqueids)
    schoolnames[i] = haskey(schoolnames,schoolDict[i]) ? schoolnames[schoolDict[i]] : "Unknown";
end

smod = schoolmod(map(x -> schoolDict[x],schoolids),edlevtests.Edlev,edlevtests.Grade,edlevtests.Score,length(uniqueids),5)

Turing.setadbackend(:reversediff)
Turing.setrdcache(true)

if ispath("cache/mcmcchain.dat")
    ch = deserialize("cache/mcmcchain.dat")
else
    ch = sample(smod,NUTS(900,.8),MCMCThreads(),500,2)
end

"Done sampling model..."
"Done sampling model..."

Now that we have fit our model, we have hundreds of samples of hundreds of parameters. Let's take a look at their distribution

for sch in 1:length(uniqueids)
    dist = schooldists[sch]
    p = plot(; linewidth=3, title = "$dist : $(schoolnames[sch])", xlim=(2.5,12),ylim=(2300,2700),legend=false)
    c4avg = mean(ch[:,Symbol("c4avg"),:])
    m4avg = mean(ch[:,Symbol("m4avg"),:])
    c7avg = mean(ch[:,Symbol("c7avg"),:])
    m7avg = mean(ch[:,Symbol("m7avg"),:])
    c11avg = mean(ch[:,Symbol("c11avg"),:])
    edlevcolors = colorschemes[:tol_light] # a RG colorblind friendly pallette
    has4 = schoolgrades[schoolDict[sch] => 4] 
    has7 = schoolgrades[schoolDict[sch] => 7] 
    has11 = schoolgrades[schoolDict[sch] => 11] 
    for ed in 1:5

        c4 = mean(ch[:,Symbol("c4[$sch]"),:] + ch[:,Symbol("cedlev4[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + c4avg
        m4 = mean(ch[:,Symbol("m4[$sch]"),:] + ch[:,Symbol("medlev4[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + m4avg
        c7 = mean(ch[:,Symbol("c7[$sch]"),:] + ch[:,Symbol("cedlev7[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + c7avg
        m7 = mean(ch[:,Symbol("m7[$sch]"),:] + ch[:,Symbol("medlev7[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + m7avg 
        c11 = mean(ch[:,Symbol("c11[$sch]"),:] + ch[:,Symbol("cedlev11[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + c11avg
        
        al = has4 ? 1.0 : .1
        p = plot!([3,5], [c4-m4,c4+m4]; linewidth=6, color=edlevcolors[ed], xlim=(2.5,12),ylim=(2300,2800),legend=false, alpha=al)
        al = has7 ? 1.0 : .1
        p = plot!([6,8], [c7-m7,c7+m7]; linewidth=6, color=edlevcolors[ed],alpha=al)
        al = has11 ? 1.0 : .1
        p = scatter!([11], [c11]; markersize=6,alpha=al,color=edlevcolors[ed])
        for i in 1:10
            edmult =  ch[i,Symbol("edlevmult[$sch]"),1]
            j = rand(1:length(ch))
            c = ch[j,Symbol("c4[$sch]"),1] + ch[j,Symbol("cedlev4[$ed]"),1] * edmult + c4avg
            m = ch[j,Symbol("m4[$sch]"),1] + ch[j,Symbol("medlev4[$ed]"),1] * edmult + m4avg
            p = plot!([3,5],[c - m, c + m],label=false,alpha=.2,color=edlevcolors[ed])

            c =  ch[j,Symbol("c7[$sch]"),1] + ch[j,Symbol("cedlev7[$ed]"),1] * edmult + c7avg
            m = ch[j,Symbol("m7[$sch]"),1] + ch[j,Symbol("medlev7[$ed]"),1] * edmult + m7avg
            p = plot!([6,8],[c - m, c + m],label=false,alpha=.2,color=edlevcolors[ed])
            c11 = ch[j,Symbol("c11[$sch]"),1] + ch[j,Symbol("cedlev11[$ed]"),1] * edmult + c11avg
            p = scatter!([randn()*.2 + 11],[c11],label=false,alpha=.2,color=edlevcolors[ed])
        end
    end
    display(p)
end

Let's take a look at specific coefficient distributions compared across schools:

let pp = []
    for (i,sch) in Iterators.enumerate(uniqueids)
        p1 = density(ch[:,Symbol("c4[$i]"),:]; title = "$(schoolnames[i]) c4",fill=true,alpha=.5,xlim=(-100,100))
        p2 = density(ch[:,Symbol("c7[$i]"),:]; title = "$(schoolnames[i]) c7",fill=true,alpha=.5,xlim=(-100,100))
        p3 = density(ch[:,Symbol("c11[$i]"),:]; title = "$(schoolnames[i]) c11",fill=true,alpha=.5,xlim=(-100,100))
        p4 = density(ch[:,Symbol("m4[$i]"),:]; title = "$(schoolnames[i]) m4",fill=true,alpha=.5,xlim=(-50,50))
        p5 = density(ch[:,Symbol("m7[$i]"),:]; title = "$(schoolnames[i]) m7",fill=true,alpha=.5,xlim=(-50,50))
        p6 = density(ch[:,Symbol("edlevmult[$i]"),:]; title = "$(schoolnames[i]) EdLevMult",fill=true,alpha=.5,xlim=(0,2))
        push!(pp,plot(p1,p2,p3,p4,p5,p6;layout =(1,6),size=(1000,1000),titlefontsize=8))
    end
    for ppp in Iterators.partition(pp,4)
        display(plot(ppp...; size=(1200,1200),layout=(4,1)))
    end
end

Let's take a look at the composition of students in each school

for (i,sch) in Iterators.enumerate(uniqueids)
    schdat = @subset(testscores,:Grade .< 12 .&& :District_Code .== sch[1] .&& :School_Code .== sch[2] .&& isa.(:Students_Tested,Int64) .&& :Test_Id .== 2 .&& in.(:Subgroup_ID, Ref((90,91,92,93,94,95))))
    schdat = @by(schdat,:Subgroup_ID, :ntested = sum(:Students_Tested))
    ntot = sum(schdat.ntested)
    p = @df schdat bar(:Subgroup_ID  , :ntested ./ ntot ; legend=false,title="$(schoolnames[sch])\nFraction at Parent Education Level")
    display(p)
end